*-------------------------------------------------------------------------------
/* title : Import and prepare raw firm-level and other relevant datasets 
   source: ORBIS-Amadeus (proprietary firm-level data)	 
   input : $data/...Orbis data for each country
   output: $temp/Raw.dta
		   $temp/Selection_criteria */
*-------------------------------------------------------------------------------

*===============================================================================
* Set up
*===============================================================================
*--------------------------SET-UP & READ DIRECTORIES----------------------------
* Show the directory the code is run from. -pwd- is used instead of a bare -cd-
* because -cd- without arguments only displays the directory in Stata for
* Windows; in Stata for Mac/Unix it changes to the home directory, which would
* break the relative path of the do-file called below.
pwd 											// Current directory of code
* NOTE(review): presumably defines the globals (data, temp, indlev) used below - confirm
do "[0.0] directories.do"

*===============================================================================
* Read and keep relevant information
*===============================================================================
*-----------------Helpers for the selection-criteria statistics-----------------
* _selcrit_stats <stage> <criterion>
*   Stores sample statistics of the data in memory as constant variables named
*   N<stage>_<criterion> (#of non-missing sales observations), F<stage>_<criterion>
*   (#of firms), Y<stage>_<criterion> (total sales) and L<stage>_<criterion>
*   (total employees). The data must be xtset. By convention in this file,
*   stage 0 = before the criterion is applied and stage 1 = after.
*   Variables are stored as double: the default float type keeps only about
*   7 significant digits, which truncates large totals.
capture program drop _selcrit_stats
program define _selcrit_stats
	args stage criterion
	xtsum sales
	gen double N`stage'_`criterion' = r(N) 		// #of observations
	gen double F`stage'_`criterion' = r(n) 		// #of firms
	sum sales
	gen double Y`stage'_`criterion' = r(sum) 	// sales
	sum numberofemployees
	gen double L`stage'_`criterion' = r(sum) 	// labour units
end

* _selcrit_save <criterion> <country>
*   Reduces the data in memory to a single row with the country indicator and
*   the statistics of <criterion>, and saves it to the temp folder.
*   Destroys the data in memory: call it between preserve and restore.
capture program drop _selcrit_save
program define _selcrit_save
	args criterion country
	keep country *_`criterion'
	bysort country: keep if _n==1
	save "$temp/Selection_criteria_`criterion'`country'.dta", replace
end

*------------Build one cleaned dataset per country: $temp/Raw_XX.dta------------
qui foreach country in AT BE BG CZ DE EE ES FI FR HR HU IT NO PL PT RO SE SI SK {

* Load the raw dataset, keeping only the relevant variables
use bvdid year naceR2_2digit tangiblefixedassets numberofemployees materialcosts 	///
	sales *deflatorR112 legalstatus consolidation shh_country shh_isfirm shh_perc 	///
	guo_country guo_isfirm guo_perc operatingrevenueturnover 						///
	using "$data/orbis_amadeus/`country'.dta", clear

gen country = "`country'" 						// Country indicator

*===============================================================================
* Keep relevant periods and sectors for matching
*===============================================================================
keep if inrange(year, 2000, 2014) 				// Time period
keep if inrange(naceR2_2digit, 10, 33) 			// Manufacturing sector (nace rev.2 2-digit codes)

*===============================================================================
* Keep unconsolidated accounts and active firms
*===============================================================================
gegen id = group(bvdid)
xtset id year 									// Needed by xtsum; also fails if a firm-year is repeated

*-----------------------Statistics for selection criteria-----------------------
* Unconsolidated accounts or those with no info: http://zeerovery.nl/blogfiles/Blog-AmaConsh.gif
preserve
	_selcrit_stats 0 consolidation
	keep if inlist(consolidation, "U1", "U2", "")
	_selcrit_stats 1 consolidation
	_selcrit_save consolidation `country'
restore

*-----------------------Statistics for selection criteria-----------------------
* Active firms or those for which no info is provided
* (computed on the same starting sample as the consolidation statistics above)
preserve
	_selcrit_stats 0 legalstatus
	keep if inlist(legalstatus, "Active", "Active (dormant)", "Active (insolvency proceedings)", "Active (receivership)", "Unknown", "Active (default of payments)", "") 
	_selcrit_stats 1 legalstatus
	_selcrit_save legalstatus `country'
restore

*--------------------Apply both criteria to the working sample------------------
keep if inlist(consolidation, "U1", "U2", "") 

keep if inlist(legalstatus, "Active", "Active (dormant)", "Active (insolvency proceedings)", "Active (receivership)", "Unknown", "Active (default of payments)", "") 

*===============================================================================
* Drop missing or non-positive values for variables used in estimation
*===============================================================================
*-----------------------Statistics for selection criteria-----------------------
preserve
	_selcrit_stats 0 missing
	foreach var of varlist sales tangiblefixedassets materialcosts numberofemployees { 
		drop if `var'<=0 | missing(`var')
	}
	_selcrit_stats 1 missing
	_selcrit_save missing `country'
restore

*-------------------------------------------------------------------------------
foreach type of var sales tangiblefixedassets materialcosts numberofemployees { 
	drop if `type'<=0 | missing(`type')
}
*===============================================================================
* Industry classifications
*===============================================================================
*------------------------------NACE2-DIGIT REV.2--------------------------------
rename naceR2_2digit nace2 						// 2-digit nace code

*--------------------------ISIC REV. 4 of WIOD (CPA)----------------------------
* Unmatched master observations are kept; the conversion file supplies the
* industry variable CPA that is used below.
merge m:1 nace2 using "$data/NACE2_ISIC4_conversion.dta", nogen keep(master matched)
drop IndustryDescription IndustryCode nace2

*===============================================================================
* Check for dataset irregularities
*===============================================================================
gisid id year 									// Aborts if a firm is repeated within a year
gduplicates report id year
gduplicates drop id year, force

*===============================================================================
* For EU Representativeness
*===============================================================================
/* When we focus on a sample of firms with on average>20 L (as in e.g. Gal, 2013
   and CompNet, 2014), the sample becomes more balanced in cross-country terms.
   See Merlevede, B., M. de Zwaan, K. Lenaerts, and V. Purice (2015) */
*-----------------------Statistics for selection criteria-----------------------
preserve
	_selcrit_stats 0 L20
	gegen AVGnumberofemployees = mean(numberofemployees), by(id)
	keep if AVGnumberofemployees>20
	drop AVGnumberofemployees
	_selcrit_stats 1 L20
	_selcrit_save L20 `country'
restore
*-------------------------------------------------------------------------------
gegen AVGnumberofemployees = mean(numberofemployees), by(id)
keep if AVGnumberofemployees>20
drop AVGnumberofemployees
*===============================================================================
* Rename deflators
*===============================================================================
rename output_deflatorR112 	OUTPUT_deflator
rename cap_deflatorR112 	TNGFA_deflator
rename mat_deflatorR112 	MCOST_deflator

*===============================================================================
* Make deflators unique for each country-industry-year pair
*===============================================================================
foreach var of varlist OUTPUT_deflator TNGFA_deflator MCOST_deflator {
	gegen temp = mean(`var'), by(country CPA year) // Average deflator: unique in each country-industry-year pair
	replace `var' = temp
	drop temp 
}

*===============================================================================
* Key variables in monetary units
*===============================================================================
rename operatingrev*		OPREV
rename sales 				SALES
rename tangiblefixedassets 	TNGFA
rename materialcosts 		MCOST
rename numberofemployees 	L

*===============================================================================
* Deflate monetary variables with industry-country-year deflators (perf. comp.!?)
*===============================================================================
gen Y = SALES/OUTPUT_deflator
gen K = TNGFA/TNGFA_deflator
gen M = MCOST/MCOST_deflator
gen VA = Y - M 									// Can be non-positive, in which case its log is missing
gen SHARE = MCOST/SALES

*===============================================================================
* Create logs for estimation variables and name them in lower case letters
*===============================================================================
* E.g. y = ln(Y). Variable names are case sensitive, so levels and logs coexist.
foreach var of varlist Y VA K L M SHARE {
	local logname = lower("`var'")
	gen `logname' = ln(`var')
}

*===============================================================================
* WIOD ISO country code (for merging proxies below)
*===============================================================================
gen countryWIOD = ""
replace countryWIOD = "AUT" if country=="AT"
replace countryWIOD = "BEL" if country=="BE"
replace countryWIOD = "BGR" if country=="BG"
replace countryWIOD = "CZE" if country=="CZ"
replace countryWIOD = "DEU" if country=="DE"
replace countryWIOD = "ESP" if country=="ES"
replace countryWIOD = "EST" if country=="EE"
replace countryWIOD = "FIN" if country=="FI"
replace countryWIOD = "FRA" if country=="FR"
replace countryWIOD = "HRV" if country=="HR"
replace countryWIOD = "HUN" if country=="HU"
replace countryWIOD = "ITA" if country=="IT"
replace countryWIOD = "NOR" if country=="NO"
replace countryWIOD = "POL" if country=="PL"
replace countryWIOD = "PRT" if country=="PT"
replace countryWIOD = "ROU" if country=="RO"
replace countryWIOD = "SVK" if country=="SK"
replace countryWIOD = "SVN" if country=="SI"
replace countryWIOD = "SWE" if country=="SE"

*===============================================================================
* Merge Proxy dataset
*===============================================================================
* The proxies are keyed on (country industry year) with 3-letter country codes,
* so country is temporarily overwritten with the WIOD code and restored after
* the merge. Only matched observations are kept.
gen country_backup = country
gen industry = CPA
replace country = countryWIOD
merge m:1 country industry year using "$temp/Proxies.dta", nogen keep(matched) keepusing(down0dIM up0dEX up0dIM down0dEX IM EX)
replace country = country_backup
drop country_backup industry

*===============================================================================
* Ownership links
*===============================================================================
* NOTE(review): no threshold on shh_perc is applied here although the label says
* >10%; presumably shh_isfirm already embeds it - confirm against the raw data.
gen shh = shh_isfirm
replace shh = 0 if missing(shh_isfirm)
label variable shh "Firm has a shh that owns >10%"

* A missing guo_perc is not <=10 in Stata, so such links keep sub as in guo_isfirm
gen sub = guo_isfirm
replace sub = 0 if missing(guo_isfirm)
replace sub = 0 if guo_perc<=10
label variable sub "Firm has a sub that owns >10%"

* Split each link into domestic/foreign by comparing the owner's country with
* the firm's country; the indicators are zero whenever the link itself is zero.
gen sub_country = guo_country
foreach name of var shh sub { 
	gen `name'_dom = 0
	replace `name'_dom = 1 if country==`name'_country
	replace `name'_dom = `name'_dom*`name'
	label variable `name'_dom "Firm has a domestic `name' that owns >10%"
	gen `name'_for = 0
	replace `name'_for = 1 if country!=`name'_country
	replace `name'_for = `name'_for*`name'
	label variable `name'_for "Firm has a foreign `name' that owns >10%"
}

*===============================================================================
* Check and drop missing values for variables used in estimation 
*===============================================================================
/* Non-positive or missing monetary variables were already dropped, so a missing
   log here comes from a missing deflator or, for va, from non-positive value
   added (Y - M <= 0). The number of dropped observations is displayed. */
noi drop if missing(y, va, k, l, m) 

*-------------------------------------------------------------------------------
drop legalstatus consolidation shh_country shh_isfirm shh_perc guo_country 	///
	 guo_isfirm guo_perc sub shh sub_country OUTPUT_deflator TNGFA_deflator ///
	 MCOST_deflator //SALES TNGFA MCOST K L M

*-------------------------------------------------------------------------------
save "$temp/Raw_`country'", replace
}

*===============================================================================
* Append country-specific datasets into one EU-level sample
*===============================================================================
* Start from the first country file and append the others; every file is erased
* once it has been read. The loop macro must be compared as a quoted string:
* a bare -country- in the condition refers to the variable of the dataset that
* is currently in memory (first observation), not to the loop element.
foreach country in AT BE BG CZ DE EE ES FI FR HR HU IT NO PL PT RO SE SI SK {
	if "`country'"=="AT" use "$temp/Raw_AT", clear
	else 				 append using "$temp/Raw_`country'", force
	erase "$temp/Raw_`country'.dta"
}

*===============================================================================
* Final Checks
*===============================================================================
*-------------------------------Firm Identifiers--------------------------------
* Firm identifier over the pooled sample (the per-country id is discarded)
drop id
gegen id = group(bvdid)

*-------------------------------For Bootstrapping-------------------------------
* newid starts as a copy of id; it is the identifier that gets redefined for
* each bootstrap sample draw
gen newid = id
quietly summarize year
local first_year = r(min)
gen trend = year - `first_year' + 1 			// Linear trend, 1 in the first sample year
gen const = 1 									// Global constant variable

*------------------------For Clustered Standard Errors--------------------------
gegen Cc  = group(country) 						// Country cluster
gegen Cj  = group(CPA) 							// Industry cluster
gegen Ccj = group(country CPA) 					// Country-industry cluster
foreach level in c j cj {
	* Copy of the cluster id to be redefined for each bootstrap sample draw
	gen newcluster_`level' = C`level'
	* Group indicator for factorial FE
	gen G`level' = C`level'
}

*-------------------Further group indicators for factorial FE-------------------
gegen Gt  = group(year) 						// Year
gegen Gct = group(country year) 				// Country-year
gegen Gjt = group($indlev year) 				// Industry-year (industry variable taken from global indlev)

*---------------------------Check for duplicates--------------------------------
gduplicates report id year
gduplicates drop id year, force

*===============================================================================
* Create lag (of first-differenced) values
*===============================================================================
* Declare the firm-year panel so that time-series operators can be used
xtset newid year

* For every firm-level variable: first and second lag, first difference, and
* the first difference of the first lag
foreach v of varlist l k m Y y sub_dom shh_dom sub_for shh_for {
	quietly {
		gen L1_`v'  = L1.`v'
		gen L2_`v'  = L2.`v'
		gen D_`v'   = D1.`v'
		gen DL1_`v' = D1.L1_`v'
	}
}

*===============================================================================
* Generate lags for key variables (country-industry-year-level)
*===============================================================================
* For every proxy: lags 1-2 of the level and lags 1-12 of the first difference.
* Relies on the panel structure declared with xtset earlier in the file.
foreach proxy of varlist down0dIM up0dEX up0dIM down0dEX IM EX {
	quietly {
		* Lagged levels
		forvalues lag = 1/2 {
			gen L`lag'_`proxy' = L`lag'.`proxy'
		}

		* Lagged first differences, built from a helper variable that is
		* removed again afterwards
		gen D`proxy' = D1.`proxy'
		forvalues lag = 1/12 {
			gen DL`lag'_`proxy' = L`lag'.D`proxy'
		}
		drop D`proxy'
	}
}

*===============================================================================
* Save Baseline raw dataset
*===============================================================================
* Re-declare the panel, shrink storage types and save in Stata 13 format
xtset newid year
quietly compress
saveold "$temp/Raw", version(13) replace

*===============================================================================
* Combine Statistics for sample selection criteria
*===============================================================================

* Merge criteria
* For each country, combine the one-row statistics files of the four selection
* criteria into $temp/Selection_criteria_XX.dta and remove the inputs.
* Stata's own -erase- is used (as for the Raw_XX files) instead of a shell call
* with backslash paths, which only works on Windows.
qui foreach country in AT BE BG CZ DE EE ES FI FR HR HU IT NO PL PT RO SE SI SK {
	use "$temp/Selection_criteria_consolidation`country'.dta", clear
	foreach criterion in legalstatus missing L20 {
		merge 1:1 country using "$temp/Selection_criteria_`criterion'`country'.dta", nogen
	}
	qui compress
	save "$temp/Selection_criteria_`country'.dta", replace

	* The criterion-specific files are no longer needed once merged
	foreach criterion in consolidation legalstatus missing L20 {
		erase "$temp/Selection_criteria_`criterion'`country'.dta"
	}
}

* Append criteria for all countries
* Stack the country rows, starting from the first country. The loop macro must
* be compared as a quoted string: a bare -country- in the condition refers to
* the variable of the dataset in memory, so the -use- branch would never run and
* the row left in memory by the previous loop would be counted twice.
* Each country file is erased with Stata's own -erase- once it has been read.
qui foreach country in AT BE BG CZ DE EE ES FI FR HR HU IT NO PL PT RO SE SI SK {
	if "`country'"=="AT" use "$temp/Selection_criteria_AT", clear
	else 				 append using "$temp/Selection_criteria_`country'.dta"
	erase "$temp/Selection_criteria_`country'.dta"
}

* Save relevant criteria
* Sum every statistic over all country rows into a single row, tag that row as
* the EU aggregate and save it
gcollapse (sum) N* F* Y* L*
gen country = "EU"
quietly compress
save "$temp/Selection_criteria.dta", replace
